import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.graph_objs as go
from plotly import tools
from plotly.subplots import make_subplots
import plotly.offline as py
import plotly.express as px
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load the case-statistics table from 'corona.csv' (path is relative to the
# working directory), then echo the frame so the notebook renders a preview.
df = pd.read_csv('corona.csv')
df
| Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | Deaths / 100 Cases | Recovered / 100 Cases | Confirmed last week | 1 week change | 1 week % increase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36263 | 1269 | 25198 | 9796 | 106 | 10 | 18 | 4 | 69 | 35526 | 737 | 2 |
| 1 | 4880 | 144 | 2745 | 1991 | 117 | 6 | 63 | 3 | 56 | 4171 | 709 | 17 |
| 2 | 27973 | 1163 | 18837 | 7973 | 616 | 8 | 749 | 4 | 67 | 23691 | 4282 | 18 |
| 3 | 907 | 52 | 803 | 52 | 10 | 0 | 0 | 6 | 89 | 884 | 23 | 3 |
| 4 | 950 | 41 | 242 | 667 | 18 | 1 | 0 | 4 | 25 | 749 | 201 | 27 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 182 | 10621 | 78 | 3752 | 6791 | 152 | 2 | 0 | 1 | 35 | 8916 | 1705 | 19 |
| 183 | 10 | 1 | 8 | 1 | 0 | 0 | 0 | 10 | 80 | 10 | 0 | 0 |
| 184 | 1691 | 483 | 833 | 375 | 10 | 4 | 36 | 29 | 49 | 1619 | 72 | 4 |
| 185 | 4552 | 140 | 2815 | 1597 | 71 | 1 | 465 | 3 | 62 | 3326 | 1226 | 37 |
| 186 | 2704 | 36 | 542 | 2126 | 192 | 2 | 24 | 1 | 20 | 1713 | 991 | 58 |
187 rows × 12 columns
# Count missing values per column; any non-zero entry would need handling
# before the frame is passed to the scaler below.
df.isnull().sum()
Confirmed 0 Deaths 0 Recovered 0 Active 0 New cases 0 New deaths 0 New recovered 0 Deaths / 100 Cases 0 Recovered / 100 Cases 0 Confirmed last week 0 1 week change 0 1 week % increase 0 dtype: int64
# Standardise every column (StandardScaler defaults: centre on the mean and
# scale to unit variance) so that no single large-magnitude column dominates
# the PCA that follows. The result is wrapped back into a DataFrame that keeps
# the original column names.
scaler = StandardScaler().fit(df)
X_scale = scaler.transform(df)
df_scale = pd.DataFrame(data=X_scale, columns=df.columns)
df_scale.head()
| Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | Deaths / 100 Cases | Recovered / 100 Cases | Confirmed last week | 1 week change | 1 week % increase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.135676 | -0.158475 | -0.134087 | -0.113774 | -0.196126 | -0.158352 | -0.218755 | 0.286100 | 0.158547 | -0.127921 | -0.183926 | -0.474273 |
| 1 | -0.217768 | -0.238477 | -0.252461 | -0.150459 | -0.194195 | -0.191764 | -0.208006 | -0.003093 | -0.336225 | -0.220861 | -0.184517 | 0.139350 |
| 2 | -0.157361 | -0.166013 | -0.167623 | -0.122342 | -0.106576 | -0.175058 | -0.044145 | 0.286100 | 0.082428 | -0.163001 | -0.109080 | 0.180259 |
| 3 | -0.228160 | -0.245019 | -0.262699 | -0.159573 | -0.212983 | -0.241883 | -0.223054 | 0.864484 | 0.919734 | -0.230604 | -0.199001 | -0.433364 |
| 4 | -0.228048 | -0.245801 | -0.265657 | -0.156682 | -0.211578 | -0.233530 | -0.223054 | 0.286100 | -1.516064 | -0.231004 | -0.195242 | 0.548432 |
# Fit a 7-component PCA on the standardised data and plot the cumulative
# percentage of variance explained, to help choose how many components to keep.
pca = PCA(n_components=7)
pca.fit(df_scale)
variance = pca.explained_variance_ratio_
# Per-component ratios are rounded to 3 decimals, converted to percent, then
# accumulated so that var[k] is the share explained by the first k + 1 components.
var = np.cumsum(np.round(variance, 3) * 100)
plt.figure(figsize=(12, 6))
plt.ylabel('% Variance Explained')
plt.xlabel('# of Features')
plt.title('PCA Analysis')
plt.ylim(0, 100.5)
# Plot against 1..len(var): the x-axis is a *count* of retained components, so
# the first point belongs at x = 1. Passing only `var` would place it at x = 0
# and shift the whole curve one step to the left.
plt.plot(np.arange(1, len(var) + 1), var)
[<matplotlib.lines.Line2D at 0x24a04211910>]
# Keep only the first three principal components and project the standardised
# data onto them; the projection is stored as a DataFrame with columns
# pc1, pc2, pc3 for the clustering and 3D plots further down.
pca = PCA(n_components=3).fit(df_scale)
pca_scale = pca.transform(df_scale)
pca_df = pd.DataFrame(data=pca_scale, columns=['pc1', 'pc2', 'pc3'])
# Share of total variance carried by each retained component.
print(pca.explained_variance_ratio_)
[0.66775024 0.11686303 0.09541686]
# Preview the projected data: one row per input row, columns pc1..pc3.
pca_df
| pc1 | pc2 | pc3 | |
|---|---|---|---|
| 0 | -0.473309 | -0.402960 | 0.405014 |
| 1 | -0.609440 | 0.357881 | 0.032721 |
| 2 | -0.397680 | 0.096930 | 0.165867 |
| 3 | -0.670486 | -0.867993 | 0.726603 |
| 4 | -0.611347 | 1.511533 | 0.402367 |
| ... | ... | ... | ... |
| 182 | -0.591708 | 0.944056 | -0.331312 |
| 183 | -0.634875 | -0.612005 | 1.856682 |
| 184 | -0.420787 | 0.788536 | 6.879038 |
| 185 | -0.596679 | 0.747688 | -0.305685 |
| 186 | -0.600317 | 2.443741 | -0.757878 |
187 rows × 3 columns
# Interactive 3D scatter of the raw PCA projection (no cluster colouring yet).
# Markers are semi-transparent so that dense regions remain readable.
Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
trace = go.Scatter3d(
    x=pca_df.iloc[:, 0],
    y=pca_df.iloc[:, 1],
    z=pca_df.iloc[:, 2],
    mode='markers',
    marker={'colorscale': 'Greys', 'opacity': 0.3, 'size': 10},
)
layout = go.Layout(
    margin={'l': 0, 'r': 0},
    scene=Scene,
    height=1000,
    width=1000,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()
# First DBSCAN attempt on the 3-component PCA projection, with hand-picked
# parameters. Prints the cluster count, the noise count and the silhouette
# coefficient of the resulting labelling.
db = DBSCAN(eps=0.2, min_samples=6).fit(pca_df)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
# (DBSCAN marks noise points with the label -1.)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Use the directly imported `silhouette_score`: the file imports that name from
# sklearn.metrics but never binds a `metrics` module, so the previous
# `metrics.silhouette_score(...)` call raised NameError on a fresh run.
print("Silhouette Coefficient: %0.3f" % silhouette_score(pca_df, labels))
Estimated number of clusters: 4 Estimated number of noise points: 139 Silhouette Coefficient: -0.238
# 3D scatter of the PCA projection with each point coloured by the DBSCAN
# label currently held in `db` (noise points carry the label -1).
Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
labels = db.labels_
trace = go.Scatter3d(
    x=pca_df.iloc[:, 0],
    y=pca_df.iloc[:, 1],
    z=pca_df.iloc[:, 2],
    mode='markers',
    marker={
        'color': labels,
        'colorscale': 'Viridis',
        'size': 10,
        'line': {'color': 'gray', 'width': 5},
    },
)
layout = go.Layout(scene=Scene, height=1000, width=1000)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='DBSCAN clusters (4) Derived from PCA',
    font={'size': 12},
)
fig.show()
# Sweep DBSCAN over a grid of (eps, min_samples) settings on the PCA projection
# and tabulate, for every setting, the number of clusters found and the
# silhouette score, so that a parameter pair can be chosen from evidence.
pca_eps_values = np.arange(0.1, 3.5, 0.1)
pca_min_samples = np.arange(2, 7)
# Full Cartesian product of the two ranges. Wrapping the two arrays in a list
# only yields a two-element list of arrays, so the loop ran twice and fed a
# float eps value into `min_samples`.
# Values are cast to built-in float/int so DBSCAN receives plain Python numbers.
pca_dbscan_params = [
    (float(eps), int(min_samples))
    for eps in pca_eps_values
    for min_samples in pca_min_samples
]
pca_no_of_clusters = []
pca_sil_score = []
pca_epsvalues = []
pca_min_samp = []
for eps, min_samples in pca_dbscan_params:
    pca_dbscan_cluster = DBSCAN(eps=eps, min_samples=min_samples).fit(pca_df)
    cluster_labels = pca_dbscan_cluster.labels_
    n_distinct_labels = len(np.unique(cluster_labels))
    pca_epsvalues.append(eps)
    pca_min_samp.append(min_samples)
    # Report real clusters only: the noise label (-1) is not a cluster. This
    # matches how n_clusters_ is computed in the single-run cells of this file.
    pca_no_of_clusters.append(
        n_distinct_labels - (1 if -1 in cluster_labels else 0))
    # silhouette_score needs at least 2 and at most n_samples - 1 distinct
    # labels and raises ValueError otherwise (e.g. everything is noise, or
    # everything lands in one cluster). Record NaN for those settings instead
    # of aborting the whole sweep.
    if 1 < n_distinct_labels < len(pca_df):
        pca_sil_score.append(silhouette_score(pca_df, cluster_labels))
    else:
        pca_sil_score.append(np.nan)
pca_eps_min = list(zip(pca_no_of_clusters, pca_sil_score, pca_epsvalues, pca_min_samp))
pca_eps_min_df = pd.DataFrame(pca_eps_min, columns=['no_of_clusters', 'silhouette_score', 'epsilon_values', 'minimum_points'])
pca_eps_min_df
| no_of_clusters | silhouette_score | epsilon_values | minimum_points | |
|---|---|---|---|---|
| 0 | 157 | 0.142960 | 0.1 | 0.2 |
| 1 | 3 | 0.608914 | 2.0 | 3.0 |
# DBSCAN run with a very small neighbourhood radius. Prints the cluster count,
# the noise count and the silhouette coefficient of the resulting labelling.
# `min_samples` must be a positive integer. The earlier value 0.2 was an eps
# value misread as a sample count. A point always counts itself as a neighbour,
# so any threshold <= 1 makes every point a core point; min_samples=1 therefore
# gives the same labelling (no noise possible) while being a valid argument.
db = DBSCAN(eps=0.1, min_samples=1).fit(pca_df)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Use the directly imported `silhouette_score`: the file never binds a
# `metrics` module, so `metrics.silhouette_score(...)` raised NameError.
print("Silhouette Coefficient: %0.3f" % silhouette_score(pca_df, labels))
Estimated number of clusters: 157 Estimated number of noise points: 0 Silhouette Coefficient: 0.143
# 3D scatter of the PCA projection, coloured by the labels of the DBSCAN model
# currently held in `db`.
Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
labels = db.labels_
trace = go.Scatter3d(
    x=pca_df.iloc[:, 0],
    y=pca_df.iloc[:, 1],
    z=pca_df.iloc[:, 2],
    mode='markers',
    marker={
        'color': labels,
        'colorscale': 'Viridis',
        'size': 10,
        'line': {'color': 'gray', 'width': 5},
    },
)
layout = go.Layout(scene=Scene, height=1000, width=1000)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='DBSCAN clusters (157) Derived from PCA',
    font={'size': 12},
)
fig.show()
# DBSCAN run with a wide neighbourhood radius (eps=2, min_samples=3). Prints
# the cluster count, the noise count and the silhouette coefficient of the
# resulting labelling.
db = DBSCAN(eps=2, min_samples=3).fit(pca_df)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
# (DBSCAN marks noise points with the label -1.)
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# Use the directly imported `silhouette_score`: the file never binds a
# `metrics` module, so `metrics.silhouette_score(...)` raised NameError.
print("Silhouette Coefficient: %0.3f" % silhouette_score(pca_df, labels))
Estimated number of clusters: 2 Estimated number of noise points: 6 Silhouette Coefficient: 0.609
# 3D scatter of the PCA projection, coloured by the labels of the DBSCAN model
# currently held in `db` (noise points carry the label -1).
Scene = {
    'xaxis': {'title': 'PC1'},
    'yaxis': {'title': 'PC2'},
    'zaxis': {'title': 'PC3'},
}
labels = db.labels_
trace = go.Scatter3d(
    x=pca_df.iloc[:, 0],
    y=pca_df.iloc[:, 1],
    z=pca_df.iloc[:, 2],
    mode='markers',
    marker={
        'color': labels,
        'colorscale': 'Viridis',
        'size': 10,
        'line': {'color': 'gray', 'width': 5},
    },
)
layout = go.Layout(scene=Scene, height=1000, width=1000)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.update_layout(
    title='DBSCAN clusters (2) Derived from PCA',
    font={'size': 12},
)
fig.show()
# Tally how many points fall under each DBSCAN label (-1 marks noise), to see
# how balanced the clusters are.
np.unique(labels, return_counts=True)
(array([-1, 0, 1], dtype=int64), array([ 6, 176, 5], dtype=int64))